# This file contains the R code used to implement the estimators and functionals in "A new test of independence", JMVA (2017).
# As a fully working example, the code herein can reproduce Figure 4b of the paper.
# The same functions (with different input data) can be used to reproduce the other figures of the paper.
# More general use of the estimators is also feasible by supplying the corresponding data set. Use of the functions is
# self-explanatory and is further facilitated by the comments at the preamble of each function.
# Bn and Dn are correct and use cut-off points according to the literature:
#  - Dn uses the Harrell (2001) Hmisc package computation and cut-off points.
#  - Bn uses the empirical cut-off points of Mudholkar and Wilding (2003, The Statistician (JRSS D)).
# Tn is not the max version but uses one bandwidth instead (not the max statistic as it results from many bandwidths).
# The bandwidth used is obtained by the lokern package and corresponds to the global (MISE-optimal) bandwidth rule for
# fixed-design kernel regression (according to Prakash's email we are focusing on fixed regressors, hence is.rand=FALSE for glkerns).
# N.B.: Yu and Jones (JASA) use the kernel regression optimal bandwidth as a starting point to further refine and obtain
# the quantile regression optimal bandwidth - this is not implemented yet.
# Moreover: the present routine is specific to sample size = 50 for the Tn statistic, i.e. the empirical quantile and
# subsequent calculations use n = 50.

options(echo=FALSE)
rm(list=ls())

library("Hmisc")
library("lokern")
library("nor1mix")
library("kSamples")
library("coin")
library("copula")

fSqArgSwitch<-function(x, dist, p1=0, p2=1)
{
  switch(dist,
         gaussian    = (dnorm(x, p1, p2))^2,
         exponential = (dexp(x, p1))^2)
}

fArgSwitch<-function(x, dist, p1=0, p2=1)
{
  switch(dist,
         gaussian    = dnorm(x, p1, p2),
         exponential = dexp(x, p1))
}

fRKswitch<-function(kernel)
{
  switch(kernel,
         rectangular = 0.5,
         gaussian    = 1/(2*sqrt(pi)))
}

fConvolution<-function(kernel)
{
  switch(kernel,
         rectangular = 0.09375,
         gaussian    = 1/(2*sqrt(2*pi)) # convolution of the standard normal density 3 times with itself, evaluated at 0
  )
}
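# A minimal sketch (not part of the original simulation) of the bandwidth rule
# described in the preamble: glkerns() from the lokern package returns the
# global plug-in bandwidth for fixed-design kernel regression when
# is.rand=FALSE. xs/ys below are hypothetical; set the guard to TRUE to run.
if (FALSE) {
  xs<-sort(runif(50))
  ys<-sin(2*pi*xs) + rnorm(50, 0, 0.2)
  hband<-glkerns(xs, ys, is.rand=FALSE)$bandwidth # global (MISE-optimal) bandwidth
  cat("plug-in bandwidth:", hband, "\n")
}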
# Rchatfxhat: estimate of the R(Delta_R) functional entering the plug-in bandwidth hopt() below
Rchatfxhat<-function(xin, yin, h, n, DensityEst)
{
  # p<-c(1:100)/100
  # integrand<-sapply(p, function(p, xin, yin, h, N) (pcf.kern.quan(xin,yin,h,N=n, p)$value)^2 * DensityEst, xin, yin, h, N)
  integrand<-function(p) { qnorm(p) }
  chat<-integrate(integrand, 0.00006, 1)$value # mean(integrand)
  mean(chat * 0.2820948) # (1/6)*sqrt(3)/pi
  chat
}

# hopt: proposed plug-in bandwidth; in the simulations below the glkerns()
# bandwidth is used instead (see the commented alternatives there)
hopt<-function(xin, yin, dist, kernel, p1, p2)
{
  n<-length(xin)
  h<-bw.nrd(xin) # bw.SJ(xin)
  DensityEst<-kde(xin, xin, h, gaussian)
  Rhatfx<-mean(DensityEst)
  RhatfxUse<-Rhatfx^{3/2}
  Intfx<-mean(DensityEst^2) # estimate \int f_X^3(x)\,dx by E f_X^2(X) = \frac{1}{n}\sum_{i=1}^n f_X^2(X_i)
  # for the normal distribution the exact value is (1/6)*sqrt(3)/pi; check against the estimate
  RK<-1/(2*sqrt(pi)) # fRKswitch(kernel); R(K)=\int_{-\infty}^{+\infty} K^2(x)\,dx, computation done in Maple; currently supports gaussian, rectangular
  RKuse<-RK^{3/2}
  ConvK<-1/(2*sqrt(2*pi)) # fConvolution(kernel)
  term1<-(sqrt(2)*ConvK*Intfx)/(3*RKuse*RhatfxUse)
  term1use<-term1^{-1/2}
  SigmaSq<-16/45 * RK * Rhatfx
  Nu2<-Intfx # mean(fSqArgSwitch(xin, dist, p1, p2)) # ~ \frac{1}{n}\sum_{i=1}^n f_X(X_i)
  RDeltaR<-Rchatfxhat(xin, yin, h, n, DensityEst)
  term2<-(n*RDeltaR)/(SigmaSq*sqrt(2*Nu2*RK))
  term2use<-term2^{-3/2}
  banduse<-term1use * term2use # proposed bandwidth: for comparison, the method used so far returns bandwidths in the range of 0.4 to 1.7
  banduse
}

kgaussian<-function(x)
{
  result<-0.3989 * exp(-0.5*x*x)
  result
}

"gaussian"<-function(x) { dnorm(x) }

# kde: kernel density estimate of xin evaluated at the points in xout
"kde"<-function(xin, xout, h, kfun)
{
  n<-length(xin)
  arg1<-(sapply(xout, "-", xin))/h
  arg2<-kfun(arg1)
  arg3<-colSums(arg2)
  arg3/(n*h)
}

int1<-function(x) { dnorm(x)^2 }
F1<-integrate(int1, lower = -Inf, upper = Inf)$value
int2<-function(x) { dnorm(x)^2 }
F2<-integrate(int2, lower = -Inf, upper = Inf)$value

# T.n2: the Tn test statistic for a single bandwidth h (the kernel argument is
# kept for interface compatibility; the gaussian kernel is hard-coded)
T.n2<-function(xi, yi, kernel, h)
{
  n<-length(xi)
  arg1<-(sapply(xi, "-", xi))/h
  arg2<-kgaussian(arg1) # K{(x_i-x_j)/h}
  arg3<-rank(yi)
  arg3.1<-sapply(arg3, "pmin", arg3)
  arg3.2<-sapply(arg3, "pmax", arg3)
  arg3.1<-(arg3.1^2)/(n+1)^2
  arg3.2<-((n-arg3.2)^2)/(n+1)^2 - 1/3
  arg4<-arg3.1 + arg3.2 # (min(Rni,Rnj)^2 + (n-max(Rni,Rnj))^2)/(n+1)^2 - 1/3
  arg5<-arg2 * arg4
  arg5[lower.tri(arg5, diag=T)]<-0 # zero out the lower triangle and diagonal, keeping only the pairs with i < j
  sum(arg5)/(n*(n-1)*h) # the normalising constant is immaterial: Tn is compared with its own simulated quantile, so any fixed positive scaling gives identical rejection decisions
}

# B.n: quadrant-count statistic Bn (empirical cut-off points from Mudholkar and Wilding, 2003)
B.n<-function(xi, yi)
{
  n<-length(xi)
  n1n2n3n4<-sapply(1:n, function(i, xi, yi) {
    xiuse<-xi[i]
    yiuse<-yi[i]
    n1<-length(which(xi<=xiuse & yi<=yiuse))
    n2<-length(which(xi>xiuse & yi<=yiuse))
    n3<-length(which(xi<=xiuse & yi>yiuse))
    n4<-length(which(xi>xiuse & yi>yiuse))
    # cat("\n", n1+n2+n3+n4)
    c(n1, n2, n3, n4)
  }, xi, yi)
  nMat<-matrix(n1n2n3n4, nrow=n, ncol=4, byrow=T)
  # nMat<-t(n1n2n3n4)
  sum((nMat[,1]*nMat[,4]-nMat[,2]*nMat[,3])^2)/n^4
}

# D.n: Hoeffding's D statistic
D.n<-function(xi, yi)
{
  n<-length(xi)
  abc<-sapply(1:n, function(i, xi, yi) {
    xiuse<-xi[i]
    yiuse<-yi[i]
    ause<-xiuse-xi > 0 # xiuse = X_a, xi = X_b - for all b
    ause1<-replace(ause, F, 0)
    ause<-sum(ause1)-1
    buse<-yiuse-yi > 0 # yiuse = Y_a, yi = Y_b - for all b
    buse1<-replace(buse, F, 0)
    buse<-sum(buse1)-1
    cuse<-sum(ause1 * buse1)-1
    c(ause, buse, cuse)
  }, xi, yi)
  abcMat<-t(abc)
  A1<-as.double(abcMat[,1])
  B1<-as.double(abcMat[,2])
  C1<-as.double(abcMat[,3])
  A<-sum(A1*(A1-1)*B1*(B1-1))
  B<-sum((A1-1)*(B1-1)*C1)
  C<-sum((C1-1)*C1)
  n<-as.double(n)
  (A-2*(n-2)*B+(n-2)*(n-3)*C)/(n*(n-1)*(n-2)*(n-3)*(n-4))
}
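# A minimal usage sketch on hypothetical data (not part of the original
# simulation): the three statistics defined above, with the T.n2 bandwidth
# taken from glkerns() as described in the preamble. Set the guard to TRUE to run.
if (FALSE) {
  xs<-rnorm(50)
  ys<-rnorm(50)
  hband<-glkerns(xs, ys, is.rand=FALSE)$bandwidth
  T.n2(xs, ys, "kgaussian", hband) # kernel-weighted rank statistic Tn
  B.n(xs, ys)                      # quadrant-count statistic Bn
  D.n(xs, ys)                      # Hoeffding's D; compare with hoeffd(xs, ys)$D[1,2]
}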
# PowerGraphs1: simulates the Tn statistic under (near-)independence - a Frank
# copula with parameter 0.01 - and returns the empirical 'cutoff' quantile of
# the 10000 simulated values, i.e. the critical value of the Tn test
PowerGraphs1<-function(SampleSize, cutoff, hfactor, F1)
{
  correl<-0
  correluse<-0
  PowerMat<-matrix(0, nrow=10000, ncol=1)
  CorMat<-matrix(0, nrow=10000, ncol=1)
  for(j in 1:10000)
  {
    cl2<-frankCopula(0.01, dim = 2) # param in (0.01, 25)
    v<-rCopula(SampleSize, cl2)
    x1<-v[,1]
    x2<-v[,2]
    # testcor<-abs(cor(x1, x2))
    # while(testcor>cutoff)
    # {
    #   x1<-rnorm(SampleSize)
    #   x2<-rnorm(SampleSize)
    #   testcor<-abs(cor(x1, x2))
    # }
    x1<-x1 # generate independent samples # rnorm(SampleSize, 0, 1) # AllSamples[,2*j-1]
    y1<-x2 # rnorm(SampleSize, 0, 1) # AllSamples[,2*j]
    ro<-correluse/100 # correlation
    z1<-ro*x1+sqrt(1-ro^2)*y1 # create the sample dependent on x1 (here ro=0, so z1 is independent of x1)
    band2<-hfactor*glkerns(x1, z1, is.rand=FALSE)$bandwidth
    # band2<-hfactor*hopt(x1, y1, "gaussian", "gaussian", 0, 1)
    # t1<-T.n2(x1, z1, "kgaussian", band1)
    t2<-T.n2(x1, z1, "kgaussian", band2)
    CorMat[j,]<-cor(x1, z1)
    # PowerMat[j,1]<-t1 # c(b1, d1, t1) # store the test statistic values
    PowerMat[j,1]<-t2
  }
  # cTn1<-PowerMat[,1]
  cTn2<-PowerMat[,1]
  # Out1<-quantile(cTn1, cutoff)
  Out2<-quantile(cTn2, cutoff)
  # c(Out1, Out2)
  Out2
}
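# Illustrative call (not part of the original simulation): the empirical 95%
# critical value of Tn for n = 50 and bandwidth factor 1.5; the settings are
# hypothetical. Set the guard to TRUE to run.
if (FALSE) {
  q95<-PowerGraphs1(50, 0.95, 1.5, F1)
}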
# PowerGraphs: estimates the power of all competing tests over a grid of Frank
# copula parameters; returns one row per parameter value
PowerGraphs<-function(SampleSize=100, iterations, hfactor, F1, F2, quse, BnCutoff, Dna.a)
{
  duse<-indepTestSim(SampleSize, 2, 2, 1000, verbose = FALSE)
  correl<-seq(0.01, 50, length=iterations) # range of Frank copula parameters (dependence strengths) to examine
  TBDmatrix<-matrix(0, nrow=iterations, ncol=8) # initialize the matrix which will contain the powers: first column the mean correlation, then one column per test
  for(i in 1:iterations)
  {
    correluse<-correl[i]
    PowerMat<-matrix(0, nrow=100, ncol=7)
    CorMat<-matrix(0, nrow=100, ncol=1)
    for(j in 1:100)
    {
      # x1<-rnorMix(SampleSize, MW.nm11)
      # x2<-rnorMix(SampleSize, MW.nm11)
      cl2<-frankCopula(correluse, dim = 2) # param in (0.01, 25)
      v<-rCopula(SampleSize, cl2)
      x1<-v[,1]
      z1<-v[,2]
      # testcor<-abs(cor(x1, x2))
      # while(testcor>cutoff)
      # {
      #   x1<-rnorm(SampleSize)
      #   x2<-rnorm(SampleSize)
      #   testcor<-abs(cor(x1, x2))
      # }
      # x1<-x1
      # y1<-x2
      ro<-correluse/100 # correlation
      # z1<-ro*x1+sqrt(1-ro^2)*y1 # create the sample dependent on x1
      b1<-B.n(x1, z1)
      d1<-hoeffd(x1, z1)$P[1,2] # D.n(x1,z1) # Hoeffding's D p-value (Hmisc)
      e1<-cor.test(x1, z1, method="spearman")$p.value # Spearman
      f1<-pvalue(fisyat_test(x1 ~ z1, distribution="approximate")) # Fisher-Yates (van der Waerden) scores; simulated (finite-sample) p-value
      # alternative van der Waerden via kSamples (not used in this script):
      # qn.test(x1, z1, test="vdW", method="simulated", dist=FALSE, Nsim=1000)$qn[2]
      itg<-indepTest(cbind(x1, z1), duse, alpha=0.05)$global.statistic.pvalue
      # cat(d1, " ", hoeffd(x1,z1)$D[1,2], "\n")
      band2<-hfactor*glkerns(x1, z1, is.rand=FALSE)$bandwidth
      # band2<-hfactor*hopt(x1, y1, "gaussian", "gaussian", 0, 1)
      m.data<-data.frame(x1, z1)
      ff1<-independence_test(x1 ~ z1, distribution="approximate", data = m.data,
                             ytrafo = function(data) savage_trafo(data),
                             xtrafo = function(data) savage_trafo(data)) # Savage scores test
      # ff1<-savage_test(object, conf.int = FALSE, conf.level = 0.95, ...) # independence_test(x1 ~ y1, data = m.data, xtrafo = function(data) savage_trafo(data))
      j1<-pvalue(ff1)
      # t1<-T.n2(x1, z1, "kgaussian", band1)
      t2<-T.n2(x1, z1, "kgaussian", band2)
      CorMat[j,]<-cor(x1, z1)
      PowerMat[j,]<-c(b1, d1, e1, f1, j1, itg, t2) # store the test statistic values / p-values
      # cat(c(b1, d1, e1, t2), "\n")
    }
    aBn<-PowerMat[,1]
    bDn<-PowerMat[,2]
    cS<-PowerMat[,3]
    cW<-PowerMat[,4]
    cSav<-PowerMat[,5]
    cItg<-PowerMat[,6]
    cTn1<-PowerMat[,7]
    # cTn2<-PowerMat[,4]
    a<-length(which(aBn > BnCutoff)) # Bn: for n=40, a=0.95 the cut-off point (Mudholkar and Wilding, 2003) is 0.0649; for n=50 it is 0.0637, for n=30 it is 0.0665
    b<-length(which(bDn < Dna.a))    # Dn: count p-values below the nominal level
    c<-length(which(cS < Dna.a))     # Spearman
    d<-length(which(cW < Dna.a))     # Fisher-Yates / van der Waerden
    e<-length(which(cSav < Dna.a))   # Savage
    pitg<-length(which(cItg < Dna.a)) # indepTest
    j<-length(which(cTn1 > quse[1])) # Tn against its simulated critical value
    # c<-length(which(cTn1 > quse[1])) # que1
    # d<-length(which(cW > quse[1])) # que2
    TBDmatrix[i,]<-c(colMeans(CorMat), a, b, c, d, e, pitg, j)
  }
  TBDmatrix
}
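# Illustrative call (not part of the original simulation): one power-curve
# computation. Each row of the returned matrix holds the mean sample
# correlation followed by the rejection counts (out of 100) of the seven
# tests. The settings are hypothetical; 0.0637 is the Bn cut-off for n=50
# quoted above. Set the guard to TRUE to run.
if (FALSE) {
  q95<-PowerGraphs1(50, 0.95, 1.5, F1)
  pw<-PowerGraphs(50, 10, 1.5, F1, F2, q95, 0.0637, 0.05)
}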
# Simulation: averages the power curves of It2 replications of PowerGraphs()
# and plots one line per test
Simulation<-function(Sample, iterations, It2, hfactor, Tn.Conf, cutB.n, CutD.n)
{
  matrixAllBn<-matrix(nrow=iterations, ncol=It2)
  matrixAllDn<-matrix(nrow=iterations, ncol=It2)
  matrixAllSn<-matrix(nrow=iterations, ncol=It2)
  matrixAllWn<-matrix(nrow=iterations, ncol=It2)
  matrixAllSavn<-matrix(nrow=iterations, ncol=It2)
  matrixAllitg<-matrix(nrow=iterations, ncol=It2)
  matrixAllTn1<-matrix(nrow=iterations, ncol=It2)
  # matrixAllTn2<-matrix(nrow=iterations, ncol=It2)
  matrixCor<-matrix(nrow=iterations, ncol=It2)
  quse<-PowerGraphs1(Sample, Tn.Conf, hfactor, F1)
  cat("for n=", Sample, "the ", Tn.Conf, " sign. level value is ", quse[1])
  for(i in 1:It2)
  {
    cat("\ni= ", i)
    arg1<-PowerGraphs(Sample, iterations, hfactor, F1, F2, quse, cutB.n, CutD.n)
    matrixCor[,i]<-arg1[,1]
    matrixAllBn[,i]<-arg1[,2]
    matrixAllDn[,i]<-arg1[,3]
    matrixAllWn[,i]<-arg1[,4]
    matrixAllSn[,i]<-arg1[,5]
    matrixAllSavn[,i]<-arg1[,6]
    matrixAllitg[,i]<-arg1[,7]
    matrixAllTn1[,i]<-arg1[,8]
    # matrixAllTn2[,i]<-arg1[,5]
  }
  xout<-rowMeans(matrixCor) # seq(1,100, length=iterations)/100
  plot(xout, rowMeans(matrixAllBn), type="l", ylim=c(0,100), ylab="Power", xlab="Correlation")
  lines(xout, rowMeans(matrixAllDn), lty=2, col=1)
  lines(xout, rowMeans(matrixAllWn), lty=3, col=1)
  lines(xout, rowMeans(matrixAllSn), lty=4, col=1)
  lines(xout, rowMeans(matrixAllSavn), lty=5, col=1)
  lines(xout, rowMeans(matrixAllitg), lty=6, col=1)
  lines(xout, rowMeans(matrixAllTn1), lty=7, col=1)
  # write.csv(matrixAllBn, file="matrixAllBn.csv")
  # write.csv(matrixAllDn, file="matrixAllDn.csv")
  # write.csv(matrixAllTn1, file="matrixAllTn1.csv")
  # lines(xout, rowMeans(matrixAllTn2), lty=2, col=5)
  legend("bottomright", c(expression(B[n]), expression(D[n]), expression(W[n]),
         expression(S[n]), expression(J[n]), expression(G[n]), expression(T[n])),
         lty=c(1,2,3,4,5,6,7), col=c(1,1,1,1,1,1,1), merge=TRUE, ncol=3) # , bg='gray90')
  # initial.dir<-"C:\\Users\\dimitrios.bagkavos\\Documents\\R\\Independence\\Test small samples\\n=40"
  # setwd(initial.dir)
  # dev.copy2eps("n=40hfactor=0.69.eps", width=6, height=6)
}

set.seed(155)
iterations<-100 # number of correlation points in [0,1]
It2<-40 # average the results of It2 replications
# Cut-off values for B_n by sample size:
# n=20, a = 0.90, 0.95, 0.99: 0.0557, 0.0704, 0.1072 respectively
# n=30, a = 0.90, 0.95, 0.99: 0.0528, 0.0665, 0.1006 respectively
# n=40, a = 0.90, 0.95, 0.99: 0.0516, 0.0649, 0.0979 respectively
# n=80, a = 0.90, 0.95, 0.99: 0.0493, 0.0617, 0.0923 respectively
# a=90%
SampleSi<-60
Simulation(SampleSi, iterations, It2, 1.5, 0.95, 0.0627, 0.05) # 0.0649 # 0.0354 instead of 0.05 p-value
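# To reproduce other figures of the paper, adjust the sample size, bandwidth
# factor and Bn cut-off accordingly (see the cut-off table above); e.g. for
# n=40 at the 95% level (illustrative, guarded so it does not run on sourcing):
if (FALSE) {
  SampleSi<-40
  Simulation(SampleSi, iterations, It2, 1.5, 0.95, 0.0649, 0.05)
}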